<!DOCTYPE html>

cars.utf8

<!DOCTYPE html>

cars.utf8

<!DOCTYPE html>

cars.utf8

<!DOCTYPE html>

cars.utf8

<!DOCTYPE html>

cars.utf8

<!DOCTYPE html>

cars.utf8

<!DOCTYPE html>

cars.utf8

<!DOCTYPE html>

cars.utf8

<!DOCTYPE html>

Cars Analysis

1. EDA

1.1 Knowing the data

To list the attributes of the data set.

attributes(cars)
## $names
## [1] "speed" "dist" 
## 
## $class
## [1] "data.frame"
## 
## $row.names
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50

To print the min, max, mean, median, and quartiles of each attribute.

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

To display the structure of your data set.

str(cars) 
## 'data.frame':    50 obs. of  2 variables:
##  $ speed: num  4 4 7 7 8 9 10 10 10 11 ...
##  $ dist : num  2 10 4 22 16 10 18 26 34 17 ...

To get the names of the attributes within your data set.

names(cars) 
## [1] "speed" "dist"

1.2 Some Plotting. The most frequent speed of the cars is between 10 and 20 mph.

# Histogram of the recorded speeds.
hist(
  cars$speed,
  main = "Speed distribution",
  xlab = "Speed [mph]"
)

The most frequent stopping distance is between 20 and 40 feet.

# Histogram of the recorded stopping distances.
hist(
  cars$dist,
  main = "Stopping distance distribution",
  xlab = "Stopping Distance [feet]"
)

A normal quantile (Q-Q) plot is a way to see whether your data is normally distributed. In the scatter plot that follows the two Q-Q plots, we can see that there is a positive correlation between speed and stopping distance.

qqnorm(cars$speed) 

qqnorm(cars$dist)

# Scatter plot of stopping distance against speed, drawn with solid blue
# points at twice the default size.
plot(
  cars,
  main = "Speed and Stopping Distance for 50 Cars",
  xlab = "Speed [mph]",
  ylab = "Stopping Distance [feet]",
  col = "blue",
  pch = 20,
  cex = 2
)

Missing values? summary() reports how many NA’s each column has, if there are any; no NA’s row appears below, so nothing is missing.

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

is.na() shows your NA’s as logical data: each value is TRUE if it’s missing and FALSE if it’s not.

is.na(cars) 
##       speed  dist
##  [1,] FALSE FALSE
##  [2,] FALSE FALSE
##  [3,] FALSE FALSE
##  [4,] FALSE FALSE
##  [5,] FALSE FALSE
##  [6,] FALSE FALSE
##  [7,] FALSE FALSE
##  [8,] FALSE FALSE
##  [9,] FALSE FALSE
## [10,] FALSE FALSE
## [11,] FALSE FALSE
## [12,] FALSE FALSE
## [13,] FALSE FALSE
## [14,] FALSE FALSE
## [15,] FALSE FALSE
## [16,] FALSE FALSE
## [17,] FALSE FALSE
## [18,] FALSE FALSE
## [19,] FALSE FALSE
## [20,] FALSE FALSE
## [21,] FALSE FALSE
## [22,] FALSE FALSE
## [23,] FALSE FALSE
## [24,] FALSE FALSE
## [25,] FALSE FALSE
## [26,] FALSE FALSE
## [27,] FALSE FALSE
## [28,] FALSE FALSE
## [29,] FALSE FALSE
## [30,] FALSE FALSE
## [31,] FALSE FALSE
## [32,] FALSE FALSE
## [33,] FALSE FALSE
## [34,] FALSE FALSE
## [35,] FALSE FALSE
## [36,] FALSE FALSE
## [37,] FALSE FALSE
## [38,] FALSE FALSE
## [39,] FALSE FALSE
## [40,] FALSE FALSE
## [41,] FALSE FALSE
## [42,] FALSE FALSE
## [43,] FALSE FALSE
## [44,] FALSE FALSE
## [45,] FALSE FALSE
## [46,] FALSE FALSE
## [47,] FALSE FALSE
## [48,] FALSE FALSE
## [49,] FALSE FALSE
## [50,] FALSE FALSE

Replace the missing values with the mean, which is a common technique, but one to use with care, as it can skew the data.

#DatasetName$ColumnName[is.na(DatasetName$ColumnName)]<-mean(DatasetName$ColumnName,na.rm = TRUE)

Creating Testing and Training Sets

# Fix the random seed so the split below is reproducible.
set.seed(122)

# Row counts for a 70/30 split; nothing is partitioned yet.
trainSize <- round(0.7 * nrow(cars))
testSize <- nrow(cars) - trainSize

# Print both sizes as a sanity check.
trainSize
## [1] 35
testSize
## [1] 15
# Draw trainSize distinct row numbers at random; those rows (in the drawn
# order) form the training set and every remaining row goes to the test set.
training_indices <- sample.int(nrow(cars), size = trainSize)
trainSet <- cars[training_indices, ]
testSet <- cars[-training_indices, ]



# Linear Regression
# *****************

# Fit stopping distance as a linear function of speed on the training rows.
cars_regression <- lm(dist ~ speed, data = trainSet)
# Print the residual summary, coefficient table and goodness-of-fit figures.
summary(cars_regression)
## 
## Call:
## lm(formula = dist ~ speed, data = trainSet)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.565 -10.236  -2.565   9.379  42.929 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -13.082      8.563  -1.528    0.136    
## speed          3.582      0.506   7.080 4.18e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.75 on 33 degrees of freedom
## Multiple R-squared:  0.603,  Adjusted R-squared:  0.591 
## F-statistic: 50.13 on 1 and 33 DF,  p-value: 4.181e-08

Predictions

# Point predictions of stopping distance for the held-out test rows.
cars_predictions <- predict(cars_regression, newdata = testSet)
cars_predictions
##         1         3         8        11        14        16        17        18 
##  1.247655 11.994691 22.741727 26.324072 29.906418 33.488763 33.488763 33.488763 
##        20        22        30        32        33        38        49 
## 37.071108 37.071108 47.818144 51.400490 51.400490 54.982835 72.894561

Confidence in your predictions: To get an idea of the accuracy of the predictions, you can ask for intervals around them. To get a matrix with the prediction and a 95 percent confidence interval around the mean prediction, set the argument interval to ‘confidence’; setting it to ‘prediction’ instead gives the wider interval expected for an individual new observation. Both are shown below:

# Fitted values for the training rows with confidence bounds on the mean.
predict(cars_regression, newdata = trainSet, interval = "confidence")
##          fit        lwr      upr
## 48 72.894561  63.111447 82.67768
## 40 58.565180  51.814724 65.31564
## 9  22.741727  14.460418 31.02304
## 47 72.894561  63.111447 82.67768
## 44 65.729871  57.581248 73.87849
## 41 58.565180  51.814724 65.31564
## 34 51.400490  45.637230 57.16375
## 19 33.488763  27.210054 39.76747
## 31 47.818144  42.321088 53.31520
## 37 54.982835  48.791536 61.17413
## 10 26.324072  18.791390 33.85676
## 45 69.312216  60.368226 78.25621
## 12 29.906418  23.049158 36.76368
## 6  19.159382  10.074322 28.24444
## 39 58.565180  51.814724 65.31564
## 36 54.982835  48.791536 61.17413
## 4  11.994691   1.186675 22.80271
## 7  22.741727  14.460418 31.02304
## 2   1.247655 -12.321487 14.81680
## 46 72.894561  63.111447 82.67768
## 23 37.071108  31.245146 42.89707
## 42 58.565180  51.814724 65.31564
## 28 44.235799  38.819199 49.65240
## 21 37.071108  31.245146 42.89707
## 35 51.400490  45.637230 57.16375
## 25 40.653454  35.123448 46.18346
## 43 58.565180  51.814724 65.31564
## 24 40.653454  35.123448 46.18346
## 27 44.235799  38.819199 49.65240
## 26 40.653454  35.123448 46.18346
## 13 29.906418  23.049158 36.76368
## 15 29.906418  23.049158 36.76368
## 5  15.577037   5.646476 25.50760
## 29 47.818144  42.321088 53.31520
## 50 76.476907  65.821243 87.13257
# Predicted values for the test rows with prediction bounds for new points.
predict(cars_regression, newdata = testSet, interval = "prediction")
##          fit        lwr       upr
## 1   1.247655 -33.547942  36.04325
## 3  11.994691 -21.819880  45.80926
## 8  22.741727 -10.351959  55.83541
## 11 26.324072  -6.590260  59.23841
## 14 29.906418  -2.859937  62.67277
## 16 33.488763   0.838584  66.13894
## 17 33.488763   0.838584  66.13894
## 18 33.488763   0.838584  66.13894
## 20 37.071108   4.504963  69.63725
## 22 37.071108   4.504963  69.63725
## 30 47.818144  15.309228  80.32706
## 32 51.400490  18.845503  83.95548
## 33 51.400490  18.845503  83.95548
## 38 54.982835  22.349352  87.61632
## 49 72.894561  39.393497 106.39563
# rmarkdown::render("cars.html")